{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 机器学习示例\n",
    "darui 19/08\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 0.载入sklearn工具包"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.svm import SVC\n",
    "from sklearn.linear_model import LogisticRegression \n",
    "import numpy as np\n",
    "import pandas \n",
    "from sklearn.model_selection import KFold\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.model_selection import cross_val_score, GridSearchCV\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 125,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn import preprocessing\n",
    "# load libraries\n",
    "import pandas\n",
    "import numpy as np \n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.ensemble import AdaBoostClassifier\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "from sklearn import ensemble\n",
    "from sklearn import tree\n",
    "from sklearn import metrics\n",
    "from sklearn import linear_model \n",
    "from sklearn import naive_bayes \n",
    "from sklearn.svm import SVC\n",
    "from sklearn import preprocessing\n",
    "import time\n",
    "from matplotlib import pyplot as plt\n",
    "import seaborn as sns\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.cross_decomposition import PLSRegression\n",
    "from sklearn.metrics import roc_curve, auc\n",
    "from sklearn.decomposition import PCA\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 124,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import cross_validate"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. 数据前处理\n",
    "## 1.1 载入数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 38519 entries, 0 to 38518\n",
      "Data columns (total 9 columns):\n",
      "power             38519 non-null float16\n",
      "range             38519 non-null float16\n",
      "angle             38519 non-null float16\n",
      "velocity          38519 non-null float16\n",
      "status_1          38519 non-null float16\n",
      "status_2          38519 non-null float16\n",
      "status_3          38519 non-null float16\n",
      "status_4          38519 non-null float16\n",
      "weak detection    38519 non-null float16\n",
      "dtypes: float16(9)\n",
      "memory usage: 677.2 KB\n",
      "None\n"
     ]
    }
   ],
   "source": [
    "X = pandas.read_csv(\"features.csv\",dtype='float16')\n",
    "Y  = pandas.read_csv(\"labels.csv\")   \n",
    "print(X.info(memory_usage=True))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1.2 探索数据分析 (exploratory data analysis)\n",
    "### 1.2.1 看看datasize"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Training set:\n",
      "# data: 38519\n",
      "# features: 9\n",
      "# passed: 8263\n",
      "% failure rate: 21.452%\n"
     ]
    }
   ],
   "source": [
    "# training set\n",
    "print (\"Training set:\")\n",
    "n_data  = len(features)\n",
    "# exclude first colum ID and last column label (substract 2)\n",
    "n_features = features.shape[1]\n",
    "# number of satisfied customers\n",
    "n_all = len(labels) \n",
    "# number of dissatisfied customers\n",
    "n_1 = np.sum(labels)\n",
    "n_1 = n_1['labels']\n",
    "sat_rate = 100.0*n_1/n_all\n",
    "print (\"# data: {}\".format(n_all))\n",
    "print (\"# features: {}\".format(n_features))\n",
    "print (\"# passed: {}\".format(n_1))\n",
    "print (\"% failure rate: {:.3f}%\".format(sat_rate))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 1.2.2 看看数据的统计数值"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>power</th>\n",
       "      <th>range</th>\n",
       "      <th>angle</th>\n",
       "      <th>velocity</th>\n",
       "      <th>status_1</th>\n",
       "      <th>status_2</th>\n",
       "      <th>status_3</th>\n",
       "      <th>status_4</th>\n",
       "      <th>weak detection</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>3.851900e+04</td>\n",
       "      <td>38519.0000</td>\n",
       "      <td>38519.000000</td>\n",
       "      <td>3.851900e+04</td>\n",
       "      <td>38519.000000</td>\n",
       "      <td>38519.000000</td>\n",
       "      <td>38519.000000</td>\n",
       "      <td>38519.000000</td>\n",
       "      <td>38519.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>inf</td>\n",
       "      <td>inf</td>\n",
       "      <td>0.577637</td>\n",
       "      <td>-inf</td>\n",
       "      <td>0.495117</td>\n",
       "      <td>0.098755</td>\n",
       "      <td>0.101379</td>\n",
       "      <td>0.304932</td>\n",
       "      <td>0.057953</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>7.804688e+00</td>\n",
       "      <td>inf</td>\n",
       "      <td>7.808594</td>\n",
       "      <td>1.756250e+01</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.298340</td>\n",
       "      <td>0.301758</td>\n",
       "      <td>0.460449</td>\n",
       "      <td>0.233643</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>3.050000e+01</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>-36.843750</td>\n",
       "      <td>-8.193750e+01</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>4.700000e+01</td>\n",
       "      <td>37.0625</td>\n",
       "      <td>-2.800781</td>\n",
       "      <td>-1.742188e+01</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>5.200000e+01</td>\n",
       "      <td>76.3750</td>\n",
       "      <td>0.399902</td>\n",
       "      <td>-7.320312e+00</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>5.750000e+01</td>\n",
       "      <td>127.4375</td>\n",
       "      <td>3.650391</td>\n",
       "      <td>-4.199219e-01</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>8.450000e+01</td>\n",
       "      <td>618.5000</td>\n",
       "      <td>33.562500</td>\n",
       "      <td>8.193750e+01</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "              power       range         angle      velocity      status_1  \\\n",
       "count  3.851900e+04  38519.0000  38519.000000  3.851900e+04  38519.000000   \n",
       "mean            inf         inf      0.577637          -inf      0.495117   \n",
       "std    7.804688e+00         inf      7.808594  1.756250e+01      0.500000   \n",
       "min    3.050000e+01      0.0000    -36.843750 -8.193750e+01      0.000000   \n",
       "25%    4.700000e+01     37.0625     -2.800781 -1.742188e+01      0.000000   \n",
       "50%    5.200000e+01     76.3750      0.399902 -7.320312e+00      0.000000   \n",
       "75%    5.750000e+01    127.4375      3.650391 -4.199219e-01      1.000000   \n",
       "max    8.450000e+01    618.5000     33.562500  8.193750e+01      1.000000   \n",
       "\n",
       "           status_2      status_3      status_4  weak detection  \n",
       "count  38519.000000  38519.000000  38519.000000    38519.000000  \n",
       "mean       0.098755      0.101379      0.304932        0.057953  \n",
       "std        0.298340      0.301758      0.460449        0.233643  \n",
       "min        0.000000      0.000000      0.000000        0.000000  \n",
       "25%        0.000000      0.000000      0.000000        0.000000  \n",
       "50%        0.000000      0.000000      0.000000        0.000000  \n",
       "75%        0.000000      0.000000      1.000000        0.000000  \n",
       "max        1.000000      1.000000      1.000000        1.000000  "
      ]
     },
     "execution_count": 97,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "## summary\n",
    "X.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>power</th>\n",
       "      <th>range</th>\n",
       "      <th>angle</th>\n",
       "      <th>velocity</th>\n",
       "      <th>status_1</th>\n",
       "      <th>status_2</th>\n",
       "      <th>status_3</th>\n",
       "      <th>status_4</th>\n",
       "      <th>weak detection</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>38519.000000</td>\n",
       "      <td>38519.000000</td>\n",
       "      <td>38519.000000</td>\n",
       "      <td>38519.000000</td>\n",
       "      <td>38519.000000</td>\n",
       "      <td>38519.000000</td>\n",
       "      <td>38519.000000</td>\n",
       "      <td>38519.000000</td>\n",
       "      <td>38519.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>52.683208</td>\n",
       "      <td>102.321735</td>\n",
       "      <td>0.578047</td>\n",
       "      <td>-7.799162</td>\n",
       "      <td>0.495002</td>\n",
       "      <td>0.098730</td>\n",
       "      <td>0.101327</td>\n",
       "      <td>0.304940</td>\n",
       "      <td>0.057945</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>7.805274</td>\n",
       "      <td>93.259543</td>\n",
       "      <td>7.811847</td>\n",
       "      <td>17.561186</td>\n",
       "      <td>0.499982</td>\n",
       "      <td>0.298304</td>\n",
       "      <td>0.301765</td>\n",
       "      <td>0.460388</td>\n",
       "      <td>0.233643</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>30.500000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-36.850000</td>\n",
       "      <td>-81.920000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>47.000000</td>\n",
       "      <td>37.070000</td>\n",
       "      <td>-2.800000</td>\n",
       "      <td>-17.420000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>52.000000</td>\n",
       "      <td>76.400000</td>\n",
       "      <td>0.400000</td>\n",
       "      <td>-7.320000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>57.500000</td>\n",
       "      <td>127.430000</td>\n",
       "      <td>3.650000</td>\n",
       "      <td>-0.420000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>84.500000</td>\n",
       "      <td>618.610000</td>\n",
       "      <td>33.550000</td>\n",
       "      <td>81.910000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "              power         range         angle      velocity      status_1  \\\n",
       "count  38519.000000  38519.000000  38519.000000  38519.000000  38519.000000   \n",
       "mean      52.683208    102.321735      0.578047     -7.799162      0.495002   \n",
       "std        7.805274     93.259543      7.811847     17.561186      0.499982   \n",
       "min       30.500000      0.000000    -36.850000    -81.920000      0.000000   \n",
       "25%       47.000000     37.070000     -2.800000    -17.420000      0.000000   \n",
       "50%       52.000000     76.400000      0.400000     -7.320000      0.000000   \n",
       "75%       57.500000    127.430000      3.650000     -0.420000      1.000000   \n",
       "max       84.500000    618.610000     33.550000     81.910000      1.000000   \n",
       "\n",
       "           status_2      status_3      status_4  weak detection  \n",
       "count  38519.000000  38519.000000  38519.000000    38519.000000  \n",
       "mean       0.098730      0.101327      0.304940        0.057945  \n",
       "std        0.298304      0.301765      0.460388        0.233643  \n",
       "min        0.000000      0.000000      0.000000        0.000000  \n",
       "25%        0.000000      0.000000      0.000000        0.000000  \n",
       "50%        0.000000      0.000000      0.000000        0.000000  \n",
       "75%        0.000000      0.000000      1.000000        0.000000  \n",
       "max        1.000000      1.000000      1.000000        1.000000  "
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "Y.describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 1.2.3 看看data分布"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Text(0, 0.5, '# of features')"
      ]
     },
     "execution_count": 92,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAY4AAAEKCAYAAAAFJbKyAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAGYNJREFUeJzt3Xm0pVV95vHvI+CEA1PB0ipsIKngkCiS2wiitoJBUAR0qSHLoYKYao1pNWmXgumWBGNHl3YcOpGWOKFGhcYBVBZYC0HbtAO3lBkJ5QQVkCotxIHlgPz6j3dfOZS37j1v3Tp3OPf7Weusc9599nvO3lzufWq/w96pKiRJGta9FroBkqSlxeCQJPVicEiSejE4JEm9GBySpF4MDklSLwaHJKmXkQZHkt2SnJvkm0muS3JYkj2SrEtyQ3vevdVNkncm2ZDkyiQHD3zOmlb/hiRrRtlmSdLMRj3ieAdwYVU9HHgMcB1wCnBxVa0GLm7bAMcAq9tjLXAGQJI9gNOAxwGHAKdNhY0kaf5lVHeOJ3kQcAVwQA18SZLrgSdX1S1JHgJcWlUHJnl3e/3RwXpTj6r6z638HvWms9dee9V+++03kn5J0rhav379D6pqxWz1dh5hGw4ANgPvT/IYYD3wSmCfqroFoIXH3q3+SuCmgf03trJtld9DkrV0IxUe9rCHMTk5uWN7I0ljLsn3hqk3ykNVOwMHA2dU1WOBn3H3YanpZJqymqH8ngVVZ1bVRFVNrFgxa2BKkrbTKINjI7Cxqr7ats+lC5Jb2yEq2vOmgfr7Duy/Crh5hnJJ0gIYWXBU1feBm5Ic2IqOBK4FzgemroxaA5zXXp8PvKhdXXUocHs7pHURcFSS3dtJ8aNamSRpAYzyHAfAfwH+Jcm9gW8DJ9GF1TlJTgZuBJ7b6l4APB3YANzR6lJVW5K8Abis1Tu9qraMuN2SpG0Y2VVVC2liYqI8OS5J/SRZX1UTs9XzznFJUi8GhySpF4NDktSLwSFJ6mXUV1VJI5Ppbg0d0hheEyLNG0cckqReDA5JUi8GhySpF4NDktSLwSFJ6sXgkCT1YnBIknoxOCRJvRgckqReDA5JUi8GhySpF4NDktSLwSFJ6sXgkCT1YnBIknoxOCRJvRgckqReDA5JUi8GhySpF4NDktSLwSFJ6sXgkCT1MtLgSPLdJFcluTzJZCvbI8m6JDe0591beZK8M8mGJFcmOXjgc9a0+jckWTPKNkuSZjYfI46nVNVBVTXRtk8BLq6q1cDFbRvgGGB1e6wFzoAuaIDTgMcBhwCnTYWNJGn+LcShquOBs9rrs4ATBso/WJ2vALsleQjwNGBdVW2pqtuAdcDR891oSVJn1MFRwOeSrE+ytpXtU1W3ALTnvVv5SuCmgX03trJtld9DkrVJJpNMbt68eQd3Q+MmmdtDWs52HvHnH15VNyfZG1iX5Jsz1J3u17FmKL9nQdWZwJkAExMTv/W+JGnHGOmIo6pubs+bgE/SnaO4tR2Coj1vatU3AvsO7L4KuHmGcknSAhhZcCTZNckDp14DRwFXA+cDU1dGrQHOa6/PB17Urq46FLi9Hcq6CDgqye7tpPhRrUyStABGeahqH+CT6Q4I7wx8pKouTHIZcE6Sk4Ebgee2+hcATwc2AHcAJwFU1ZYkbwAua/VOr6otI2y3JGkGqRq/0wETExM1OTm50M3QiC3kSeox/LWRSLJ+4NaJbfLOcUlSLwaHJKkXg0OS1IvBIUnqxeCQJPVicEiSejE4JEm9GBySpF5GPcmhNCNnmpWWHkcckqReDA5JUi8eqtKcebhJWl4ccUiSejE4JEm9GBySpF4MDklSL54cl7bDXC4IcBEoLXWOOCRJvRgckqReDA5JUi8GhySpF4NDktSLwSFJ6sXgkCT1YnBIknoxOCRJvRgckqReRh4cSXZK8o0kn2nb+yf5apIbkpyd5N6t/D5te0N7f7+Bzzi1lV+f5GmjbrMkadvmY8TxSuC6ge03A2+rqtXAbcDJrfxk4Laq+l3gba0eSR4JnAg8CjgaeFeSneah3ZKkaYw0OJKsAp4BvKdtBzgCOLdVOQs4ob0+vm3T3j+y1T8e+FhV/aKqvgNsAA4ZZbslSds26hHH24HXAHe17T2BH1XVnW17I7CyvV4J3ATQ3r+91f9N+TT7SJLm2ciCI8mxwKaqWj9YPE3VmuW9mfYZ/L61SSaTTG7evLl3eyVJwxnliONw4Lgk3wU+RneI6u3Abkmm1gFZBdzcXm8E9gVo7z8Y2DJYPs0+v1FVZ1bVRFVNrFixYsf3RpIEjDA4qurUqlpVVfvRndz+fFU9H7gEeE6rtgY4r70+v23T3v98VVUrP7FddbU/sBr42qjaLUma2azBkWTXJPdqr38vyXFJdpnDd74W+KskG+jOYby3lb8X2LOV/xVwCkBVXQOcA1wLXAi8vKp+PYfvlyTNQWqWdSyTrAeeCOwOfAWYBO5oo4dFaWJioiYnJxe6GcvGXJZRXY5cOlaLVZL1VTUxW71hDlWlqu4Ang38r6p6FvDIuTZQkrQ0DRUcSQ4Dng98tpXtPEN9SdIYGyY4XgWcCnyyqq5JcgDdCW5J0jI068ihqr4AfCHJrm3728ArRt0wSdLiNMxVVYcluZY231SSxyR518hbJklalIY5VPV24GnADwGq6grgSaNslDTOku1/SIvBUDcAVtVNWxV5H4UkLVPDXB11U5LHA9XWzngF95wmXZK0jAwz4ngp8HK6GWk3Age1bUnSMjTjiKMtmPTCxXyXuCRpfs044mhzQh0/T22RJC0Bw5zj+Nck/wicDfxsqrCqvj6yVkmSFq1hguPx7fn0gbKiW19DkrTMDHPn+FPmoyGSpKVh1uBI8vrpyqvq9OnKJUnjbZhDVT8beH1f4Fi8j0OSlq1hDlX9z8HtJG+lW85VkrQMbc+a4/cHDtjRDZEkLQ3DnOO4iu4qKoCdgBXAG0bZKEnS4jXMOY5jB17fCdxaVXeOqD2SpEVumENVf1dV32uPf6+qO5N8aOQtkyQtSsMEx6MGN5LsDPzhaJojSVrsthkcSU5N8hPg0Ul+3B4/AW4Fzpu3FkqSFpVtBkdV/X1VPRB4S1U9qD0eWFV7VtWp89hGSdIiMsx9HKcm2R1YTXcD4FT5F0fZMEnS4jTM5bgvAV4JrAIuBw4FvoyTHErSsjTMyfFXAv8R+F6b8PCxwOaRtkqStGgNExw/r6qfAyS5T1V9Ezhwtp2S3DfJ15JckeSaJH/byvdP8tUkNyQ5u61jTpL7tO0N7f39Bj7r1FZ+fZKnbU9HJUk7xjDBsTHJbsCngHVJzgNuHmK/XwBHVNVj6NYpPzrJocCbgbdV1WrgNuDkVv9k4Laq+l3gba0eSR4JnEh3WfDRwLvakraSpAUwa3BU1bOq6kdV9TfAfwfeC5wwxH5VVT9tm7u0x9QCUOe28rMGPuv4tk17/8gkaeUfq6pfVNV3gA3AIUP0TZI0AkNNcpjkCUlOqqov0J0YXznkfjsluRzYBKwDvgX8aGDKko0Dn7USuAmgvX87sOdg+TT7SJLm2azBkeQ04LXA1L0buwAfHubDq+rXVXUQ3RVZhwCPmK7a1Fdt471tlW/dzrVJJpNMbt7suXtJGpVhRhzPAo6jLehUVTcDD+zzJVX1I+BSukt5d2vTlkAXKFPnSzYC+8JvpjV5MLBlsHyafQa/48yqmqiqiRUrVvRpniSph2GC45dVVbR/5SfZdZgPTrKinVQnyf2Ap9KtHHgJ8JxWbQ13T19yftumvf/59r3nAye2q672p7sR8WvDtEGStOMNM636OUneTTdS+DPgxcA/D7HfQ4Cz2hVQ9wLOqarPJLkW+FiSvwO+QXeynfb8oSQb6EYaJwJU1TVJzgGupZvW/eVV9evhuyhJ2pHS/aN+mje6ezZ+0V7/EXAU3fmGi6pq3fw1sb+JiYmanJxc6GYsG5nuLJRGYhu/rtIOkWR9VU3MVm+mEceXgYOTfKiqXkh3VZQkaZmbKTjunWQN8Pgkz976zar6xOiaJUlarGYKjpcCzwd2A5651XsFGByStAxtMziq6kvAl5JMVtV7t1VPkrS8DDPliKEhSfqNoaYckSRpykxrjh/enu8zf82RJC12M4043tmevzwfDZEkLQ0zXVX1qyTvB1YmeefWb1bVK0bXLEnSYjVTcBxLN7/UEcD6+WmOJGmxm+ly3B/QzSl1XVVdMY9tkiQtYsNcVfXDJJ9MsinJrUk+nmTVyFsmSVqUhgmO99NNbf5QupX3Pt3KJEnL0DDBsXdVvb+q7myPDwCulCRJy9QwwbE5yQva+uE7JXkB8MNRN0yStDgNExwvBp4HfB+4hW51vhePslGSpMVr1hUAq+pGujXHJUlyripJUj8GhySpF4NDktTLrMGR5L8NvHamXEla5maaVv01SQ6ju4pqijPljqlk+x+SlpeZrqq6HngucECS/wtcB+yZ5MCqun5eWidJWnRmOlR1G/A6YAPwZO5en+OUJP9vxO2SJC1SM404jgZOA34H+AfgCuBnVXXSfDRMkrQ4bXPEUVWvq6ojge8CH6YLmRVJvpTk0/PUPknSIjPrnePARVV1GXBZkpdV1ROS7DXqhkmSFqdZL8etqtcMbP5pK/vBbPsl2TfJJUmuS3JNkle28j2SrEtyQ3vevZUnyTuTbEhyZZKDBz5rTat/Q5I1fTspSdpxet0A2HMlwDuB/1pVjwAOBV6e5JHAKcDFVbUauLhtAxwDrG6PtcAZ0AUN3bmWxwGHAKdNhY0kaf6N7M7xqrqlqr7eXv+E7nLelcDxwFmt2lnACe318cAHq/MVYLckDwGeBqyrqi1VdRuwju7EvSRpAczLlCNJ9gMeC3wV2KeqboEuXIC9W7WVwE0Du21sZdsqlyQtgJEHR5IHAB8HXlVVP56p6jRlNUP51t+zNslkksnNmzdvX2MlSbMaaXAk2YUuNP6lqj7Rim9th6Boz5ta+UZg34HdVwE3z1B+D1V1ZlVNVNXEihWubKvx5NQwWgxGFhxJArwXuK6q/mHgrfOBqSuj1gDnDZS/qF1ddShwezuUdRFwVJLd20nxo1qZJGkBDHMfx/Y6HHghcFWSy1vZ64A3AeckORm4kW4+LIALgKfTTXFyB3ASQFVtSfIG4LJW7/Sq2jLCdkuSZpCq3zpdsORNTEzU5OTkQjdjSfFQxvgbw1917WBJ1lfVxGz1XMhJktSLwSFJ6sXgkCT1YnBIknoxOCRJvRgckqReDA5JUi8GhySpF4NDktTLKKcckbSIzHV2AO881xRHHJKkXgwOSVIvBockqReDQ5LUi8EhSerF4JAk9WJwSJJ68T6OMTGXa/S9Pl9SH444JEm9GBySpF4MDklSLwaHJKkXg0OS1IvBIUnqxeCQJPVicEiSejE4JEm9jCw4krwvyaYkVw+U7ZFkXZIb2vPurTxJ3plkQ5Irkxw8sM+aVv+GJGtG1V5J0nBGOeL4AHD0VmWnABdX1Wrg4rYNcAywuj3WAmdAFzTAacDjgEOA06bCRpK0MEYWHFX1RWDLVsXHA2e112cBJwyUf7A6XwF2S/IQ4GnAuqraUlW3Aev47TCSJM2j+T7HsU9V3QLQnvdu5SuBmwbqbWxl2yqXJC2QxXJyfLq5XWuG8t/+gGRtkskkk5s3b96hjZMk3W2+g+PWdgiK9ryplW8E9h2otwq4eYby31JVZ1bVRFVNrFixYoc3XFruku1/aLzMd3CcD0xdGbUGOG+g/EXt6qpDgdvboayLgKOS7N5Oih/VyiRJC2RkCzkl+SjwZGCvJBvpro56E3BOkpOBG4HntuoXAE8HNgB3ACcBVNWWJG8ALmv1Tq+qrU+4S5LmUWoMl3+bmJioycnJhW7GvJrrCoAeTtAojeGfmbGUZH1VTcxWb7GcHJckLREGhySpF4NDktSLwSFJ6sXgkCT1YnBIknoZ2X0ckjRlrpeLa3FxxCFJ6sURxyLiTXiSlgJHHJKkXgwOSVIvBockqRfPcUha1Lwia/FxxCFJ6sXgkCT1YnBIknoxOCRJvRgckqReDA5JUi8GhySpF4NDktSLwSFJ6sXgkCT1YnBIknpxripJY8t5rkbDEYckqRdHHDuYq/hJGndLZsSR5Ogk1yfZkOSUhW6PJC1XS2LEkWQn4J+APwI2ApclOb+qrl3YlkkaV3M9ejDO50iWyojjEGBDVX27qn4JfAw4flRflmz/Q5LG3ZIYcQArgZsGtjcCj1ugtkjSrBbqH5LzMdJZKsEx3Y/gHv95kqwF1rbNnya5vud37AX8YDvatlRss39jNFJatj/DMTHu/YN56OMcf5//wzCVlkpwbAT2HdheBdw8WKGqzgTO3N4vSDJZVRPbu/9iN+79g/Hvo/1b+salj0vlHMdlwOok+ye5N3AicP4Ct0mSlqUlMeKoqjuT/AVwEbAT8L6qumaBmyVJy9KSCA6AqroAuGCEX7Hdh7mWiHHvH4x/H+3f0jcWfUyN88XGkqQdbqmc45AkLRLLLjiS3DfJ15JckeSaJH/byvdP8tUkNyQ5u52EX7KS7JTkG0k+07bHrX/fTXJVksuTTLayPZKsa31cl2T3hW7n9kqyW5Jzk3wzyXVJDhuz/h3YfnZTjx8nedWY9fEv29+Yq5N8tP3tGYvfw2UXHMAvgCOq6jHAQcDRSQ4F3gy8rapWA7cBJy9gG3eEVwLXDWyPW/8AnlJVBw1c3ngKcHHr48Vte6l6B3BhVT0ceAzdz3Js+ldV17ef3UHAHwJ3AJ9kTPqYZCXwCmCiqn6f7qKeExmT38NlFxzV+Wnb3KU9CjgCOLeVnwWcsADN2yGSrAKeAbynbYcx6t8MjqfrGyzhPiZ5EPAk4L0AVfXLqvoRY9K/aRwJfKuqvsd49XFn4H5JdgbuD9zCmPweLrvggN8cxrkc2ASsA74F/Kiq7mxVNtJNc7JUvR14DXBX296T8eofdGH/uSTr26wBAPtU1S0A7XnvBWvd3BwAbAbe3w43vifJroxP/7Z2IvDR9nos+lhV/w68FbiRLjBuB9YzJr+HyzI4qurXbYi8im4CxUdMV21+W7VjJDkW2FRV6weLp6m6JPs34PCqOhg4Bnh5kictdIN2oJ2Bg4EzquqxwM9YoodsZtOO8R8H/J+FbsuO1M7NHA/sDzwU2JXu/9WtLcnfw2UZHFPa8P9S4FBgtzakhGmmNFlCDgeOS/JdulmEj6AbgYxL/wCoqpvb8ya6Y+OHALcmeQhAe960cC2ck43Axqr6ats+ly5IxqV/g44Bvl5Vt7btcenjU4HvVNXmqvoV8Ang8YzJ7+GyC44kK5Ls1l7fj+4HfB1wCfCcVm0NcN7CtHBuqurUqlpVVfvRHQL4fFU9nzHpH0CSXZM8cOo1cBRwNd00NGtatSXbx6r6PnBTkgNb0ZHAtYxJ/7byJ9x9mArGp483AocmuX87xzj1MxyL38NldwNgkkfTnZTaiS44z6mq05McQPcv9D2AbwAvqKpfLFxL5y7Jk4FXV9Wx49S/1pdPts2dgY9U1RuT7AmcAzyM7hf3uVW1ZYGaOSdJDqK7uOHewLeBk2j/vzIG/QNIcn+65RIOqKrbW9k4/Qz/Fvhj4E6637mX0J3TWPK/h8suOCRJc7PsDlVJkubG4JAk9WJwSJJ6MTgkSb0YHJKkXgwOjZV2n86X2oykJwyUn5fkoXP87Aum7gHqsc9Lk7xoLt+7vZJcmmTJr2+txWfJrAAoDelP6O7T+RhwIfCpJM+kuzt5TnfpVtXTt2Of/z2X75QWI0ccGje/Au4H3Ae4q03v8CrgLdvaIckHkpyR5JIk307yn5K8r62D8YGBet9Nsle7c/2z6dZ0uTrJH7f335Tk2iRXJnlrK/ubJK9ury9N8uZ068H8W5IntvL7Jzmn7Xd2W69hYqs2HpPknIHtJyf5dHt9RpLJDKwvM00ffzrw+jlT/WojtI8nuaw9Du/x31rLlCMOjZuPtMeLgNcCfw58sKrumGW/3enm9ToO+DTdnF8vAS5LclBVXT5Q92jg5qp6BkCSByfZA3gW8PCqqhkOae1cVYckeTpwGt2UN38O3FZVj07y+8Dl0+y3Dnh3kl2r6md0dySf3d7766rakmQn4OIkj66qK2fp75R30K0P8aUkDwMuYvpJP6XfcMShsVJVt1fVM9riTl8HjgU+nuSf062od9g2dv10ddMoXAXcWlVXVdVdwDXAflvVvQp4ahs9PLFNl/Fj4OfAe5I8m25houl8oj2vH/jcJ9AdWqOqrgZ+649+m4r7QuCZbRT1DO6e5+h5Sb5ON4XFo4BHbuO7p/NU4B/bMgPnAw+amgdM2haDQ+Ps9cAb6c57rAdeDPyPbdSdmi/oroHXU9v3GJlX1b/RrVp3FfD3SV7f/rAfAnycbnGeC2f5nl8PfO50095P52zgeXQjo8uq6idJ9gdeDRxZVY8GPgvcd5p9B+cWGnz/XsBhU6vxVdXKqvrJkO3RMmVwaCwlWQ08tKq+QLf62l10fzyn+6Pa97MfCtxRVR+mW6zn4CQPAB5cVRfQnVM5qMdHfokuEEjySOAPtlHvUrrp1f+Muw9TPYhuvY7bk+zD9Gs+QDdd+SOS3IvukNqUzwF/MdC3Pu3WMuU5Do2rNwJ/3V5/FPgU3Trsr98Bn/0HwFuS3EV3Mv5lwAOB85Lcl24E8Zc9Pu9dwFlJrqQ73HQl3Ypx91BVv07yGeBPaVOPV9UVSb5Bd0jt28C/buM7TgE+Qzcb7dXAA1r5K4B/at+9M/BF4KU92q5lyNlxpQXWTmrvUlU/T/I7wMXA71XVLxe4adK0HHFIC+/+wCVJdqEbrbzM0NBi5ohDktSLJ8clSb0YHJKkXgwOSVIvBockqReDQ5LUi8EhSerl/wM3ZLHAG33kAgAAAABJRU5ErkJggg==\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# The distribution of features with missing values\n",
    "plt.hist(features.power,20,facecolor='b')\n",
    "plt.xlabel(\"% missing value\")\n",
    "plt.ylabel(\"# of features\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 1.2.4. 数据正态"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\preprocessing\\data.py:334: DataConversionWarning: Data with input dtype float16 were all converted to float64 by MinMaxScaler.\n",
      "  return self.partial_fit(X, y)\n"
     ]
    }
   ],
   "source": [
    "# normalized\n",
    "scaler = preprocessing.MinMaxScaler() # max min scaling\n",
    "X_n = scaler.fit_transform(X) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "1.2 分配train 和text data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 134,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.8)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 143,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Define scoring function (performance metrics)\n",
    "def score_model(clf):\n",
    "    print (\"\\nClassifier: {}...\".format(clf.__class__.__name__))\n",
    "    start = time.time()\n",
    "    # use 3-fold CV\n",
    "    scores = cross_validate(clf, X_train, Y_train, scoring='accuracy', cv=3) \n",
    "    end = time.time()\n",
    "    print (\"time (secs): {:.3f}\".format(end - start))\n",
    "    print (\"roc_auc: {:.3f}\".format(scores.mean()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 145,
   "metadata": {},
   "outputs": [],
   "source": [
    "clf = tree.DecisionTreeClassifier()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 146,
   "metadata": {},
   "outputs": [],
   "source": [
    "scores = cross_validate(clf, X_train, Y_train, scoring='accuracy', cv=3) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 152,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.8729065711820411"
      ]
     },
     "execution_count": 152,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.mean(scores['test_score'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 144,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Classifier: DecisionTreeClassifier...\n",
      "time (secs): 0.163\n"
     ]
    },
    {
     "ename": "AttributeError",
     "evalue": "'DeprecationDict' object has no attribute 'mean'",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mAttributeError\u001b[0m                            Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-144-3deeb9126b6c>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m      2\u001b[0m \u001b[0mscores\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m{\u001b[0m\u001b[1;33m}\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      3\u001b[0m \u001b[1;31m# Decision Tree\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 4\u001b[1;33m \u001b[0mscores\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'tree'\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mscore_model\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtree\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mDecisionTreeClassifier\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m      5\u001b[0m \u001b[1;31m# naive bayes\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      6\u001b[0m \u001b[1;31m#scores['gaussian'] = score_model(naive_bayes.GaussianNB())\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m<ipython-input-143-d606fcaf8f2e>\u001b[0m in \u001b[0;36mscore_model\u001b[1;34m(clf)\u001b[0m\n\u001b[0;32m      7\u001b[0m     \u001b[0mend\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtime\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtime\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      8\u001b[0m     \u001b[0mprint\u001b[0m \u001b[1;33m(\u001b[0m\u001b[1;34m\"time (secs): {:.3f}\"\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mend\u001b[0m \u001b[1;33m-\u001b[0m \u001b[0mstart\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 9\u001b[1;33m     \u001b[0mprint\u001b[0m \u001b[1;33m(\u001b[0m\u001b[1;34m\"roc_auc: {:.3f}\"\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mscores\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmean\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[1;31mAttributeError\u001b[0m: 'DeprecationDict' object has no attribute 'mean'"
     ]
    }
   ],
   "source": [
    "# Compare different algrithem\n",
    "scores = {}\n",
    "# Decision Tree\n",
    "scores['tree'] = score_model(tree.DecisionTreeClassifier()) \n",
    "# naive bayes\n",
    "#scores['gaussian'] = score_model(naive_bayes.GaussianNB())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 153,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Variable                 Type                      Data/Info\n",
      "------------------------------------------------------------\n",
      "AdaBoostClassifier       ABCMeta                   <class 'sklearn.ensemble.<...>ting.AdaBoostClassifier'>\n",
      "DecisionTreeClassifier   ABCMeta                   <class 'sklearn.tree.tree<...>.DecisionTreeClassifier'>\n",
      "GridSearchCV             ABCMeta                   <class 'sklearn.model_sel<...>on._search.GridSearchCV'>\n",
      "KFold                    ABCMeta                   <class 'sklearn.model_selection._split.KFold'>\n",
      "LogisticRegression       type                      <class 'sklearn.linear_mo<...>stic.LogisticRegression'>\n",
      "PCA                      ABCMeta                   <class 'sklearn.decomposition.pca.PCA'>\n",
      "PLSRegression            ABCMeta                   <class 'sklearn.cross_dec<...>tion.pls_.PLSRegression'>\n",
      "RandomForestClassifier   ABCMeta                   <class 'sklearn.ensemble.<...>.RandomForestClassifier'>\n",
      "SVC                      ABCMeta                   <class 'sklearn.svm.classes.SVC'>\n",
      "StandardScaler           type                      <class 'sklearn.preproces<...>ing.data.StandardScaler'>\n",
      "X                        DataFrame                        power       range <...>n[38519 rows x 9 columns]\n",
      "X_n                      ndarray                   38519x9: 346671 elems, type `float64`, 2773368 bytes (2.6448898315429688 Mb)\n",
      "X_test                   DataFrame                        power       range <...>n[30816 rows x 9 columns]\n",
      "X_train                  DataFrame                        power       range <...>\\n[7703 rows x 9 columns]\n",
      "X_train_normalized       ndarray                   38519x9: 346671 elems, type `float64`, 2773368 bytes (2.6448898315429688 Mb)\n",
      "Y                        DataFrame                        labels\\n0         <...>n[38519 rows x 1 columns]\n",
      "Y_test                   DataFrame                        labels\\n17304     <...>n[30816 rows x 1 columns]\n",
      "Y_train                  DataFrame                        labels\\n27947     <...>\\n[7703 rows x 1 columns]\n",
      "a                        Series                    labels    8263\\ndtype: int64\n",
      "auc                      function                  <function auc at 0x000001E7CA09E510>\n",
      "clf                      DecisionTreeClassifier    DecisionTreeClassifier(cl<...>         splitter='best')\n",
      "cross_val_score          function                  <function cross_val_score at 0x000001E7CA09E400>\n",
      "cross_validate           function                  <function cross_validate at 0x000001E7CA093BF8>\n",
      "df                       DataFrame                        label\\n0        Na<...>n[38519 rows x 1 columns]\n",
      "ensemble                 module                    <module 'sklearn.ensemble<...>\\\\ensemble\\\\__init__.py'>\n",
      "features                 DataFrame                        power       range <...>n[38519 rows x 9 columns]\n",
      "labels                   DataFrame                        labels\\n0         <...>n[38519 rows x 1 columns]\n",
      "linear_model             module                    <module 'sklearn.linear_m<...>near_model\\\\__init__.py'>\n",
      "metrics                  module                    <module 'sklearn.metrics'<...>n\\\\metrics\\\\__init__.py'>\n",
      "n_1                      int64                     8263\n",
      "n_all                    int                       38519\n",
      "n_data                   int                       38519\n",
      "n_features               int                       9\n",
      "n_sat                    int                       38519\n",
      "n_unsat                  int                       38519\n",
      "naive_bayes              module                    <module 'sklearn.naive_ba<...>sklearn\\\\naive_bayes.py'>\n",
      "np                       module                    <module 'numpy' from 'C:\\<...>ges\\\\numpy\\\\__init__.py'>\n",
      "pandas                   module                    <module 'pandas' from 'C:<...>es\\\\pandas\\\\__init__.py'>\n",
      "pd                       module                    <module 'pandas' from 'C:<...>es\\\\pandas\\\\__init__.py'>\n",
      "plt                      module                    <module 'matplotlib.pyplo<...>\\\\matplotlib\\\\pyplot.py'>\n",
      "preprocessing            module                    <module 'sklearn.preproce<...>processing\\\\__init__.py'>\n",
      "roc_curve                function                  <function roc_curve at 0x000001E7CA09E8C8>\n",
      "sat_rate                 float64                   21.451751083880684\n",
      "scaler                   MinMaxScaler              MinMaxScaler(copy=True, feature_range=(0, 1))\n",
      "score_model              function                  <function score_model at 0x000001E7CF0689D8>\n",
      "scores                   DeprecationDict           {'fit_time': array([0.050<...>re': array([1., 1., 1.])}\n",
      "sns                      module                    <module 'seaborn' from 'C<...>s\\\\seaborn\\\\__init__.py'>\n",
      "test_X                   DataFrame                        power       range <...>n[30816 rows x 9 columns]\n",
      "test_Y                   DataFrame                        labels\\n3143      <...>n[30816 rows x 1 columns]\n",
      "time                     module                    <module 'time' (built-in)>\n",
      "train_X                  DataFrame                        power       range <...>\\n[7703 rows x 9 columns]\n",
      "train_Y                  DataFrame                        labels\\n19238     <...>\\n[7703 rows x 1 columns]\n",
      "train_test_split         function                  <function train_test_split at 0x000001E7CA093AE8>\n",
      "tree                     module                    <module 'sklearn.tree' fr<...>earn\\\\tree\\\\__init__.py'>\n",
      "val                      int64                     8263\n",
      "vectors                  DataFrame                        labels\\n0         <...>n[38519 rows x 1 columns]\n"
     ]
    }
   ],
   "source": [
    "%whos"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 调参 grid search"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 通过learning curve 判断overfit 或 underfit"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_X, test_X, train_Y, test_Y = train_test_split(source_X, Y, test_size = 0.5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_train = pandas.read_csv(\"train.csv\")\n",
    "df_test  = pandas.read_csv(\"test.csv\")   \n",
    "print(\"Data loaded\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#1.1 载入数据\n",
    "1.2探索数据分析 exporaty data analyis\n",
    "看看datasize\n",
    "1.3 看卡data分布\n",
    "1.4. normal\n",
    "\n",
    "2. 归一化（normalization）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "2. 数据训练\n",
    "1\n",
    "1.定义模型\n",
    "2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "v = open(\"train_vectors.csv\")\n",
    "source_X = pd.read_csv(v)\n",
    "l = open(\"train_labels.csv\")\n",
    "source_Y = pd.read_csv(l)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "探索数据分析 exporaty data analyis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\linear_model\\logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
      "  FutureWarning)\n",
      "C:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\linear_model\\logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
      "  FutureWarning)\n",
      "C:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\linear_model\\logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
      "  FutureWarning)\n",
      "C:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\linear_model\\logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
      "  FutureWarning)\n",
      "C:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\linear_model\\logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
      "  FutureWarning)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.6072752746858379\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\ensemble\\forest.py:246: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.\n",
      "  \"10 in version 0.20 to 100 in 0.22.\", FutureWarning)\n",
      "C:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\ensemble\\forest.py:246: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.\n",
      "  \"10 in version 0.20 to 100 in 0.22.\", FutureWarning)\n",
      "C:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\ensemble\\forest.py:246: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.\n",
      "  \"10 in version 0.20 to 100 in 0.22.\", FutureWarning)\n",
      "C:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\ensemble\\forest.py:246: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.\n",
      "  \"10 in version 0.20 to 100 in 0.22.\", FutureWarning)\n",
      "C:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\ensemble\\forest.py:246: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.\n",
      "  \"10 in version 0.20 to 100 in 0.22.\", FutureWarning)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.5687876177878946\n"
     ]
    }
   ],
   "source": [
    "train_X, test_X, train_Y, test_Y = train_test_split(source_X, source_Y, test_size = 0.5)\n",
    "LR = LogisticRegression()\n",
    "scores = cross_val_score(LR, source_X, source_Y.values.ravel(), cv = 5, scoring = 'recall')\n",
    "print(scores.mean())\n",
    "\n",
    "RF = RandomForestClassifier()\n",
    "scores = cross_val_score(RF, source_X, source_Y.values.ravel(), cv = 5, scoring = 'recall')\n",
    "print(scores.mean())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
